import logging
import random
import re
import time

import requests
from lxml import etree


class LianJiaSpider:
    """Crawl paginated Lianjia listing pages and print one dict per listing.

    Each listing is reduced to a dict with the keys ``name``, ``floor``,
    ``unit`` and ``address``; a field that cannot be found on the page is
    reported as ``None`` instead of aborting the page.
    """

    # Number of download attempts per page before giving up.
    MAX_RETRIES = 3
    # Per-request timeout, in seconds, passed to ``requests.get``.
    TIMEOUT = 5
    # XPath selecting the container of each listing on a page.
    ITEM_XPATH = '//div[@class="content__list--item"]/div'
    # XPaths (relative to one listing) of the three address fragments.
    ADDRESS_XPATHS = (
        './/p[2]/a[1]/text()',
        './/p[2]/a[2]/text()',
        './/p[2]/a[3]/text()',
    )

    def __init__(self):
        # ``{}`` is replaced by the 1-based page number in ``run``.
        self.url = 'https://gy.lianjia.com/zufang/guanshanhuqu/pg{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}

    def _fetch(self, url):
        """Download ``url`` and return the response body as text.

        Raises:
            requests.RequestException: on connection errors or timeouts.
        """
        response = requests.get(url=url, headers=self.headers, timeout=self.TIMEOUT)
        return response.text

    def get_html(self, url):
        """Fetch ``url`` (retrying on network errors) and parse it once.

        The page is parsed after the first successful download only, so a
        page is never processed more than once. Failures are logged rather
        than raised so that one bad page does not stop the whole crawl.
        """
        logger = logging.getLogger(__name__)
        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                html = self._fetch(url)
            except requests.RequestException as exc:
                logger.warning("Attempt %d/%d failed for %s: %s",
                               attempt, self.MAX_RETRIES, url, exc)
                continue
            try:
                self.parse_html(html)
            except Exception:
                # Page-level boundary: keep the crawl best-effort, but leave
                # a traceback instead of silently discarding the error.
                logger.exception("Failed to parse %s", url)
            return
        logger.error("Giving up on %s after %d attempts", url, self.MAX_RETRIES)

    @staticmethod
    def _text_at(texts, index=0):
        """Return ``texts[index]`` stripped, or ``None`` if it does not exist."""
        if len(texts) > index:
            return texts[index].strip()
        return None

    @staticmethod
    def _build_item(name_texts, info_texts, unit_texts, address_texts):
        """Assemble one listing dict from the raw text nodes of a listing.

        Args:
            name_texts: text nodes of the title link; the first one is used.
            info_texts: text nodes of the info spans; the second one is used
                as the floor description.
            unit_texts: text nodes holding the price; the first one is used.
            address_texts: iterable of text-node lists, one per address
                fragment; the first node of each non-empty list is used.

        Returns:
            A new dict with keys ``name``, ``floor``, ``unit``, ``address``.
            Any field whose source text is missing is ``None``.
        """
        text_at = LianJiaSpider._text_at

        floor = text_at(info_texts, 1)
        if floor is not None:
            # Remove whitespace inside the string as well as around it.
            floor = re.sub(r"\s+", "", floor)

        unit = text_at(unit_texts)
        if unit is not None:
            unit = unit + "元/月"

        fragments = [text_at(texts) for texts in address_texts]
        address = "".join(part for part in fragments if part) or None

        return {
            'name': text_at(name_texts),
            'floor': floor,
            'unit': unit,
            'address': address,
        }

    def parse_html(self, html):
        """Extract every listing from ``html``, print it and return them all.

        Args:
            html: the page markup as returned by ``_fetch``.

        Returns:
            A list with one dict per listing (empty when the page has no
            listings or no parsable content).
        """
        if not html or not html.strip():
            return []
        tree = etree.HTML(html)
        if tree is None:
            return []

        items = []
        for div in tree.xpath(self.ITEM_XPATH):
            # A fresh dict per listing so results never alias each other.
            item = self._build_item(
                div.xpath('.//p[1]/a/text()'),
                div.xpath('.//p[2]/span/text()'),
                div.xpath('.//span/em/text()'),
                [div.xpath(path) for path in self.ADDRESS_XPATHS],
            )
            print(item)
            items.append(item)
        return items

    def run(self, first_page=1, last_page=100):
        """Crawl pages ``first_page``..``last_page`` (inclusive) in order.

        Sleeps a random 1-2 seconds after each page to throttle requests.
        """
        for pg in range(first_page, last_page + 1):
            url = self.url.format(pg)
            self.get_html(url)
            time.sleep(random.randint(1, 2))


if __name__ == '__main__':
    # Script entry point: build a spider and crawl all configured pages.
    LianJiaSpider().run()
